





In machine learning and statistics, classification is the problem of identifying to which of a set of categories (sub-populations) a new observation belongs, on the basis of a training set of data containing observations (or instances) whose category membership is known.
# Set up plotting defaults and run the canned aihub classification demo.
from aihub import aihub
import matplotlib

# Global matplotlib font settings for the plots below.
# FIX: 'normal' is not a valid font *family* (it is a style/weight value);
# matplotlib cannot resolve it and emits "findfont: Font family ['normal']
# not found" warnings before falling back. Use the generic 'sans-serif'
# family, which is what the fallback renders anyway.
font = {'family': 'sans-serif',
        'weight': 'regular',
        'size': 12}
matplotlib.rc('font', **font)

# Demonstration: built-in classification example shipped with aihub.
aihub.example_classification()
Cluster analysis or clustering is the task of grouping a set of objects in such a way that objects in the same group (called a cluster) are more similar (in some sense) to each other than to those in other groups (clusters). It is a main task of exploratory data mining.
aihub.example_cluster()
Regression: In statistical modeling, regression analysis is a set of statistical processes for estimating the relationships among variables.
Lasso Regression: In statistics and machine learning, lasso (least absolute shrinkage and selection operator; also Lasso or LASSO) is a regression analysis method that performs both variable selection and regularization in order to enhance the prediction accuracy and interpretability of the statistical model it produces.
Support Vector Machines: In machine learning, support vector machines (SVMs, also support vector networks) are supervised learning models with associated learning algorithms that analyze data used for classification and regression analysis.
# Environment setup: stdlib/third-party imports plus the anahub tool modules.
import warnings
import pandas as pd
import matplotlib
import os
# Change into the project directory first so the local module imports below
# and any relative resource paths resolve.
os.chdir("d:/machine-learning/anahub")
# developed by Usman Ahmad
from datahub import datahub   # dataset registry / loader
from auxhub import auxhub     # auxiliary helpers (splitting, indexing, ...)
from edahub import edahub     # exploratory-data-analysis plots and tables
from algohub import algohub   # imported for completeness; not used below
from aihub import aihub       # model training / prediction front end
# Register the AAPL price file under the name "aapl" and preview its head.
dh = datahub()
aapl_file = "d:/data/NASDAQ_AAPL.txt"
dh.add_dataset("aapl", aapl_file)
dh.get_dataset("aapl").head()
In machine learning and statistics, exploratory data analysis (EDA) is an approach to analyzing data sets to summarize their main characteristics, often with visual methods.
# Exploratory analysis of the AAPL dataset against the "Close" target,
# followed by a pairwise-correlation report.
edahub.init_analysis(dh.get_dataset("aapl"), "Close")
edahub.get_correlations(dh.get_dataset("aapl"))
# Hand the registered datasets to a fresh aihub instance and list what it
# now holds.
aih = aihub()
aih.load_data(dh)
aih.datahub.get_names()
Algorithms: Multivariate regression, Lasso regression, Support Vector Machines (Linear, Polynomial)
# Silence library warnings emitted during model fitting.
warnings.simplefilter("ignore")

# Forecast horizon: number of days to predict ahead.
DAYS_PREDICT = 500

# Fit four regressors (multivariate, linear SVR, polynomial SVR, lasso)
# on AAPL's Close and Volume columns with a 20% hold-out split.
aih.run_regression(
    ["aapl", "aapl"],
    ["Close", "Volume"],
    [DAYS_PREDICT, DAYS_PREDICT],
    algos = ["reg", "svr_linear", "svr_poly", "lasso"],
    test_sizes = [0.2],
    filldrop = ["fill"],
    force_encoding = "le"
)

# Plot the original Close series against each SVR variant's prediction.
dorig = aih.datahub.get_dataset("aapl")
for algo_key, plot_title in (("svr_linear", "SVR Linear"),
                             ("svr_poly", "SVR Polynomial")):
    pred_name = "aapl.regression.{0}.Close.{1}.[0.2].prediction".format(
        algo_key, DAYS_PREDICT)
    edahub.plot_regression(dorig, aih.datahub.get_dataset(pred_name),
                           "Close", xlab = "Days", title = plot_title)
# Load the kmart train/test CSVs into a fresh datahub and preview both.
dk = datahub()
dk.add_datasets({
"kmart.train": "d:/data/kmart/train_kmart.csv",
"kmart.test": "d:/data/kmart/test_kmart.csv"})
display(dk.get_dataset("kmart.train").head())
display(dk.get_dataset("kmart.test").head())
# EDA on the training set against the continuous sales target, plus a
# per-column missing-value summary.
edahub.init_analysis(dk.get_dataset("kmart.train"), "Item_Outlet_Sales")
edahub.missing_values_table(dk.get_dataset("kmart.train"))
# Derive a NUM_CLASSES-way label ("Target_Sales") from the continuous sales
# column using two binning strategies: "cut" and "pct" (presumably
# equal-width vs percentile bins — confirm in auxhub.classify_split).
NUM_CLASSES = 2
dk.add_datasets({
"kmart.cut.train": auxhub.classify_split(
dk.get_dataset("kmart.train"), "Item_Outlet_Sales", "Target_Sales",
method = "cut", num_classes = NUM_CLASSES),
"kmart.pct.train": auxhub.classify_split(
dk.get_dataset("kmart.train"), "Item_Outlet_Sales", "Target_Sales",
method = "pct", num_classes = NUM_CLASSES),
})
# Class-frequency plots for both binnings.
edahub.plot_frequency(dk.get_dataset("kmart.cut.train"), "Target_Sales")
edahub.plot_frequency(dk.get_dataset("kmart.pct.train"), "Target_Sales")
# Encode the datasets with both label ("le") and one-hot ("ohe") encodings;
# target_cols lists the target columns (presumably excluded from feature
# encoding — confirm against datahub.encode).
dk.encode([('kmart.cut.train', "Target_Sales"), ('kmart.pct.train', "Target_Sales")],
['kmart.test'], target_cols = ['Item_Outlet_Sales', 'Target_Sales'], encoding_arr = ["le", "ohe"])
# one-hot encoding
dk.get_dataset("kmart.pct.train.Target_Sales.ohe.features").head()
# integer encoding
dk.get_dataset("kmart.pct.train.Target_Sales.le.features").head()
Feature scaling is a method used to standardize the range of independent variables or features of data. In data processing, it is also known as data normalisation and is generally performed during the data preprocessing step.
Since the range of values of raw data varies widely, in some machine learning algorithms, objective functions will not work properly without normalisation.
# one-hot encoded features, labels, test data to predict
d2 = dk
# Select dataset names by role: training features (excluding the ".full."
# variants), their labels, and the test features to score later.
# FIX: use the idiomatic "x not in y" instead of "not x in y" (PEP 8, E713).
ohe_features = [x for x in d2.get_names() if "features" in x and "train" in x and ".ohe." in x and "full" not in x]
ohe_labels = [x for x in d2.get_names() if "labels" in x and ".ohe." in x]
ohe_test_features = [x for x in d2.get_names() if "features" in x and "test" in x and ".ohe." in x and "full" not in x]
ohe_zlf = list(zip(ohe_features, ohe_labels))
# integer encoded features, labels, test data to predict
le_features = [x for x in d2.get_names() if "features" in x and "train" in x and ".le." in x and "full" not in x]
le_labels = [x for x in d2.get_names() if "labels" in x and ".le." in x]
le_test_features = [x for x in d2.get_names() if "features" in x and "test" in x and ".le." in x and "full" not in x]
le_zlf = list(zip(le_features, le_labels))
# Run the imputer/scaler pipeline on each (features, labels) pair and apply
# it to the matching test features (presumably impute-then-scale — confirm
# in datahub.run_imputer_scaler). NOTE: this registers new ".impsca"
# datasets, so both selection passes above must run before either call.
dk.run_imputer_scaler(ohe_zlf, ohe_test_features)
dk.run_imputer_scaler(le_zlf, le_test_features)
KDE (kernel density estimation) is a method of inferring the relationship of the field in question to the target column. It is a non-parametric method of estimating the probability density function (PDF) of a continuous random variable: it does not assume any underlying distribution for the variable.
# Smaller global font for the dense plots below.
# FIX: 'normal' is not a valid matplotlib font *family* (it is a
# weight/style value) and triggers "findfont: Font family ['normal'] not
# found" warnings; use the generic 'sans-serif' family instead.
font = {'family': 'sans-serif',
        'weight': 'regular',
        'size': 10}
matplotlib.rc('font', **font)
# Per-feature kernel-density plots against both the binned and the raw
# sales target, using the unreduced ("full") label-encoded feature frame.
df1 = dk.get_dataset('kmart.pct.train.Target_Sales.le.full.features')
edahub.plot_density_all(df1, "Target_Sales", NUM_CLASSES)
edahub.plot_density_all(df1, "Item_Outlet_Sales", NUM_CLASSES)
# Split the frame into "Item" and "Outlet" groups (presumably by column-name
# prefix — confirm in auxhub.seperate_data [sic]) and draw correlation
# heatmaps for the full frame and each group.
groups = ["Item", "Outlet"]
data_sets = auxhub.seperate_data(df1, groups)
edahub.corr_heatmap(df1)
edahub.corr_heatmap(data_sets["Item"])
edahub.corr_heatmap(data_sets["Outlet"])
# Pair plots of each group's columns together with the two target columns.
item_data = pd.concat([ data_sets['Item'], df1[[ "Item_Outlet_Sales", "Target_Sales" ]] ], axis = 1)
edahub.pairs_plot(item_data, "Item_Outlet_Sales", "Target_Sales")
outlet_data = pd.concat([ data_sets['Outlet'], df1[[ "Item_Outlet_Sales", "Target_Sales" ]] ], axis = 1)
edahub.pairs_plot(outlet_data, "Item_Outlet_Sales", "Target_Sales")
# Elbow curve over the imputed/scaled features to gauge a k-means cluster
# count.
df2 = dk.get_dataset('kmart.pct.train.Target_Sales.le.features.impsca')
edahub.elbow_curve(df2, algo = "kmeans", max_c = 35)
# New aihub instance over the kmart datahub; silence fit-time warnings.
aih2 = aihub()
aih2.load_data(dk)
warnings.simplefilter("ignore")
# Train four classifiers (decision tree, random forest, Gaussian, quadratic)
# on both encodings. Argument roles presumed from the names — (train
# features, test features, labels); confirm against aihub.run_classifier.
aih2.run_classifier(
["kmart.pct.train.Target_Sales.le.features.impsca","kmart.pct.train.Target_Sales.ohe.features.impsca"],
["kmart.test.Target_Sales.le.features.impsca","kmart.test.Target_Sales.ohe.features.impsca"],
["kmart.pct.train.Target_Sales.le.labels","kmart.pct.train.Target_Sales.ohe.labels"],
algos = ["dtree", "rforest", "gauss", "quad"]
)
# Pull the decision-tree predictions and per-class Z-scores produced by the
# one-hot pipeline and attach them to the raw kmart test frame for review.
dtree_pred = aih2.datahub.get_dataset('kmart.pct.train.Target_Sales.ohe.features.impsca.classifier.dtree.prediction')
dtree_zscore = aih2.datahub.get_dataset('kmart.pct.train.Target_Sales.ohe.features.impsca.classifier.dtree.Z-score')
test_df = aih2.datahub.get_dataset('kmart.test')
# Take the prediction frame's first column as the predicted-label series.
first_col = dtree_pred.columns[0]
test_df['Target_Sales_Prediction'] = dtree_pred[first_col]
test_df['Z-Score-0'] = dtree_zscore[0]
test_df['Z-Score-1'] = dtree_zscore[1]
test_df.head(20)
# Frames from the label-encoded percentile pipeline: raw features (for the
# column names), imputed/scaled features, and labels.
df00 = aih2.datahub.get_dataset('kmart.pct.train.Target_Sales.le.features')
df02 = aih2.datahub.get_dataset('kmart.pct.train.Target_Sales.le.features.impsca')
df03 = aih2.datahub.get_dataset('kmart.pct.train.Target_Sales.le.labels')
# Visualize the classifiers over the Item_Visibility / Item_MRP plane.
edahub.plot_classifiers(df02, df03, list(df00.columns), "Item_Visibility", "Item_MRP", NUM_CLASSES)
aih2.datahub.get_dataset("kmart.pct.train.Target_Sales.le.features").head()
# Column order of the pre-scaling feature frame: the ".impsca" data below is
# indexed positionally, so these names map column indices to features.
cols = list(aih2.datahub.get_dataset("kmart.pct.train.Target_Sales.le.features").columns)
cols
aih2.datahub.get_dataset("kmart.pct.train.Target_Sales.le.features.impsca")
aih2.datahub.get_dataset("kmart.test.Target_Sales.le.features.impsca")
# Restrict the next experiment to two features; their positional indices are
# looked up from the column order above.
select_cols = ["Item_MRP", "Outlet_Establishment_Year"]
#select_cols = ["Item_MRP"]
item_mrp_idxs = auxhub.get_indices(cols, select_cols)
# Re-run the classifiers on just the selected feature columns: build a small
# datahub holding labels, the raw test frame, and positionally-sliced
# imputed/scaled train and test features.
aih3 = aihub()
dk2 = datahub()
dk2.add_dataset("k_label", aih2.datahub.get_dataset("kmart.pct.train.Target_Sales.le.labels"))
dk2.add_dataset("k_test", aih2.datahub.get_dataset("kmart.test"))
# The ".impsca" outputs support NumPy-style [:, idxs] slicing; item_mrp_idxs
# comes from the column lookup in the previous cell.
dk2.add_dataset("k_test_impsca", aih2.datahub.get_dataset("kmart.test.Target_Sales.le.features.impsca")[:, item_mrp_idxs])
dk2.add_dataset("k_mrp_impsca", aih2.datahub.get_dataset(
"kmart.pct.train.Target_Sales.le.features.impsca")[:, item_mrp_idxs])
aih3.load_data(dk2)
aih3.datahub.get_dataset("k_test_impsca")
warnings.simplefilter("ignore")
# Same four algorithms as the full-feature run, for comparison.
aih3.run_classifier(
["k_mrp_impsca"],
["k_test_impsca"],
["k_label"],
algos = ["dtree", "rforest", "gauss", "quad"]
)
aih3.datahub.get_names()
# Random-forest predictions and Z-scores on the reduced feature set,
# attached to a frame built from the two selected test columns.
# NOTE(review): this key is 'valid-Z-score' whereas the earlier cell used
# 'Z-score' — presumably a different (validation) artifact; confirm in aihub.
pred2 = aih3.datahub.get_dataset('k_mrp_impsca.classifier.rforest.prediction')
zscore2 = aih3.datahub.get_dataset('k_mrp_impsca.classifier.rforest.valid-Z-score')
new_df2 = pd.DataFrame(aih3.datahub.get_dataset('k_test'), columns = select_cols)
# Take the single prediction column as a Series before assigning it.
pred2 = pred2[list(pred2.columns)[0]]
new_df2['Target_Sales_Prediction'] = pred2
new_df2['Z-Score-0'] = zscore2[0]
new_df2['Z-Score-1'] = zscore2[1]
new_df2.head(20)